# This script prepares the different imputed, oversampled and undersampled training datasets to develop the early life CAPE models. 
# These early life training datasets prepared in this script will have had the following optimisation techniques applied: MICE imputation > ADASYN oversampling > random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "MICE_imputed_standardised_earlylife_training_dataset_1113ID.csv" is found in IOWBC_imputed_data.xlsx, sheet: "Standardised MICE early life training"
# The data in files named "MICE_imputed_oversampled_earlylife_dataset_XXX.csv" were developed using the script "Data_preparation_CAPE_imputation_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports (these must come before os.chdir below: the original script called
# os.chdir on the line above `import os`, which raises NameError at startup)
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root "/" — presumably a
# placeholder; replace with the actual data directory before running.
os.chdir("/../../")

# Import datasets
data_0 = pd.read_csv("MICE_imputed_standardised_earlylife_training_dataset_1113ID.csv", index_col=False)
del data_0['Unnamed: 0']
# Undersample the controls 
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:167,]
data_0b = s1.append(pd.DataFrame(data = s0), ignore_index=True)
data_0b = shuffle(data_0b, random_state=123)
print('Original dataset shape %s' % Counter(data_0b.Asthma_10YR))
# Original dataset shape Counter({1: 167, 0: 167})

# Import the 25% ADASYN-oversampled early life training dataset
data_25 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_25%.csv", index_col=False)
data_25 = data_25.iloc[0:1155, :]  # keep only the first 1155 rows of the oversampled export
# Undersample the controls to match the 209 cases (1:1 class balance)
s1 = data_25.loc[data_25['Asthma_10YR'] == 1]
s0 = data_25.loc[data_25['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:209]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_25b = pd.concat([s1, s0], ignore_index=True)
data_25b = shuffle(data_25b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_25b.Asthma_10YR))
# Balanced dataset shape Counter({1: 209, 0: 209})


# Import the 50% ADASYN-oversampled early life training dataset
data_50 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_50%.csv", index_col=False)
data_50 = data_50.iloc[0:1197, :]  # keep only the first 1197 rows of the oversampled export
# Undersample the controls to match the 251 cases (1:1 class balance)
s1 = data_50.loc[data_50['Asthma_10YR'] == 1]
s0 = data_50.loc[data_50['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:251]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_50b = pd.concat([s1, s0], ignore_index=True)
data_50b = shuffle(data_50b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_50b.Asthma_10YR))
# Balanced dataset shape Counter({1: 251, 0: 251})


# Import the 100% ADASYN-oversampled early life training dataset
data_100 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_100%.csv", index_col=False)
data_100 = data_100.iloc[0:1280, :]  # keep only the first 1280 rows of the oversampled export
# Undersample the controls to match the 334 cases (1:1 class balance)
s1 = data_100.loc[data_100['Asthma_10YR'] == 1]
s0 = data_100.loc[data_100['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:334]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_100b = pd.concat([s1, s0], ignore_index=True)
data_100b = shuffle(data_100b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_100b.Asthma_10YR))
# Balanced dataset shape Counter({1: 334, 0: 334})

# Import the 150% ADASYN-oversampled early life training dataset
data_150 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_150%.csv", index_col=False)
data_150 = data_150.iloc[0:1364, :]  # keep only the first 1364 rows of the oversampled export
# Undersample the controls to match the 418 cases (1:1 class balance)
s1 = data_150.loc[data_150['Asthma_10YR'] == 1]
s0 = data_150.loc[data_150['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:418]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_150b = pd.concat([s1, s0], ignore_index=True)
data_150b = shuffle(data_150b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_150b.Asthma_10YR))
# Balanced dataset shape Counter({1: 418, 0: 418})

# Import the 200% ADASYN-oversampled early life training dataset
data_200 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_200%.csv", index_col=False)
data_200 = data_200.iloc[0:1447, :]  # keep only the first 1447 rows of the oversampled export
# Undersample the controls to match the 501 cases (1:1 class balance)
s1 = data_200.loc[data_200['Asthma_10YR'] == 1]
s0 = data_200.loc[data_200['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:501]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_200b = pd.concat([s1, s0], ignore_index=True)
data_200b = shuffle(data_200b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_200b.Asthma_10YR))
# Balanced dataset shape Counter({1: 501, 0: 501})

# Import the 250% ADASYN-oversampled early life training dataset
data_250 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_250%.csv", index_col=False)
data_250 = data_250.iloc[0:1531, :]  # keep only the first 1531 rows of the oversampled export
# Undersample the controls to match the 585 cases (1:1 class balance)
s1 = data_250.loc[data_250['Asthma_10YR'] == 1]
s0 = data_250.loc[data_250['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:585]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_250b = pd.concat([s1, s0], ignore_index=True)
data_250b = shuffle(data_250b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_250b.Asthma_10YR))
# Balanced dataset shape Counter({1: 585, 0: 585})

# Import the 300% ADASYN-oversampled early life training dataset
data_300 = pd.read_csv("MICE_imputed_oversampled_earlylife_dataset_300%.csv", index_col=False)
data_300 = data_300.iloc[0:1614, :]  # keep only the first 1614 rows of the oversampled export
# Undersample the controls to match the 668 cases (1:1 class balance)
s1 = data_300.loc[data_300['Asthma_10YR'] == 1]
s0 = data_300.loc[data_300['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducibility
s0 = s0.iloc[:668]
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
data_300b = pd.concat([s1, s0], ignore_index=True)
data_300b = shuffle(data_300b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_300b.Asthma_10YR))
# Balanced dataset shape Counter({1: 668, 0: 668})


# Assign all training datasets to be considered for model development into one
# list, ordered by oversampling level (0%, 25%, 50%, 100%, 150%, 200%, 250%, 300%)
data = [
    data_0b,
    data_25b,
    data_50b,
    data_100b,
    data_150b,
    data_200b,
    data_250b,
    data_300b,
]

# Indices into `data`, derived from its length so the two cannot drift apart.
# Used during model development to loop through each training dataset.
# NOTE(review): the name `set` shadows the built-in `set` type; it is kept
# unchanged here for compatibility with the downstream model-development scripts.
set = list(range(len(data)))

# Import early life test data, standardised against the imputed early life
# training dataset - data found in IOWBC_imputed_data.xlsx,
# sheet: "Standardised earlylife test set"
test = pd.read_csv("Early_life_MICE_standardised_test_dataset_255IDs.csv", index_col=False)
test = test.drop(columns=['Unnamed: 0'])  # discard the exported row-index column
# Split test data into outcome and features
y_test = test['Asthma_10YR']
X_test = test.drop(columns=['Study_ID', 'Asthma_10YR'])


